Comparing genomic offset predictions

Published

January 8, 2025

We compare the genomic offset predictions from the four following methods:

Code
# Method names
meth_names <- c("GDM","LFMM","GF","RDA", "pRDA")

# population names
pop_names <- readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%  arrange(pop) %>% pull(pop)

# SNP sets
snp_sets <- readRDS(here("outputs/list_snp_sets.rds"))

Comparing population ranks

In this section, we compare GO predictions from the different methods (GF, GDM, RDA and LFMM), SNP sets (candidates, control and all SNPs) and GCMs. For that, for each combination of method/SNP set/GCM, we ranked the populations based on the GO predictions, i.e. populations with the lowest rank had the highest GO predictions.

Across methods/SNP sets

We first compared the population ranks from the different combinations method/SNP set. For that, we generated one plot per GCM. We also generated a plot in which the population ranks are not based on an unique GCM but are based on the average GO predictions across all GCMs. In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one combination method/SNP set.

Code
#################################################################
# Ranking the populations for each combination method/SNP set/GCM
#################################################################

# Extracting the genomic offset predictions from LFMM, GFM and GF
# ===============================================================
sub_meth_names <- c("LFMM", "GF", "GDM")

go_df <- lapply(sub_meth_names, function(meth_name) {
  
  go_predictions <- readRDS(file=here(paste0("outputs/",meth_name,"/go_predictions.rds")))
  
  go_predictions %>% 
    lapply(function(snp_set) {

    snp_set$go %>% 
        lapply(function(gcm){
          
        tibble(pop = pop_names,
               go = gcm) %>%
            arrange(go) %>% 
            mutate(rank = rev(1:length(pop_names)))
        
      }) %>% bind_rows(.id="gcm")
        
    }) %>% bind_rows(.id="snp_set")


}) %>% 
  setNames(sub_meth_names) %>% 
  bind_rows(.id="method")


# Extracting the genomic offset predictions from the RDA and pRDA
# ===============================================================

# Which method do we use for the RDA? 
RDA_method <- "predict" # "loadings" or "predict"

# loading the genomic offset predictions from the RDA
rda_pred <- readRDS(file=here(paste0("outputs/RDA/go_predictions_",RDA_method,".rds")))

# pRDA (RDA corrected for population structure)
prda_pred <- lapply(names(snp_sets), function(set_i){
  
  rda_pred[[set_i]]$go_corrected %>% 
    
    lapply(function(gcm){ 
      
      tibble(pop = pop_names,
             go = gcm) %>%
             arrange(go) %>% 
             mutate(rank = rev(1:length(pop_names)),
                    snp_set = set_i,
                    method = "pRDA")
        
      }) %>% bind_rows(.id="gcm")
        
    }) %>% bind_rows()

# RDA (RDA not corrected for population structure)
go_df <- lapply(names(snp_sets), function(set_i){
  
  rda_pred[[set_i]]$go_uncorrected %>% 
    
    lapply(function(gcm){ 
      
      tibble(pop = pop_names,
             go = gcm) %>%
             arrange(go) %>% 
             mutate(rank = rev(1:length(pop_names)),
                    snp_set = set_i,
                    method = "RDA")
        
      }) %>% bind_rows(.id="gcm")
        
    }) %>% 
   bind_rows() %>% 
   bind_rows(prda_pred) %>% 
   bind_rows(go_df)


# ============================================================================================================
# ============================================================================================================

# For each combination of method (GF, GDM, LFMM, pRDA and RDA) and SNP set, 
# we calculate the average of the GO predictions across the five GCMs, 
# and then we rank the populations based on their GO predicted value.


# First option (that we should use I think): take the mean of the GO predictions
go_df <- go_df %>% 
  group_by(pop,method,snp_set) %>% 
  summarise(go=mean(go)) %>% 
  ungroup() %>% 
  group_split(method,snp_set) %>% 
  lapply(function(x){
    x %>% 
      arrange(go) %>% 
      mutate(rank = rev(1:length(pop_names)),
             gcm = "GCMs_average")
  }) %>% 
  bind_rows() %>% 
  bind_rows(go_df)

# Another option (the first I use, but I think we should use the option above): take the mean of the ranks
# go_df %>%
#   group_by(pop,method,snp_set) %>%
#   summarise(mean_rank=mean(rank),go=mean(go)) %>%
#   ungroup() %>%
#   group_split(method,snp_set) %>%
#   lapply(function(x){
#     x %>%
#       arrange(mean_rank) %>%
#       mutate(rank = 1:length(pop_names),
#              gcm = "GCM average") %>%
#       dplyr::select(-mean_rank)
#   }) %>%
#   bind_rows() %>%
#   bind_rows(go_df)


# =====================================
# =====================================

# FIGURE TITLE AND LABELS

# We create a vector that we will use for the x-axis labels and the plot title
go_df <- lapply(snp_sets, function(set_i){
  lapply(meth_names, function(meth_i){
    tibble(method_snpset_names = paste0(meth_i, " - ", set_i$set_name),
           meth_snpset = paste0(meth_i, "_", set_i$set_code),
           snpset_names = set_i$set_name,
           method = meth_i,
           snp_set = set_i$set_code)
    })
  }) %>% 
  bind_rows %>% 
  left_join(go_df, by = c("method", "snp_set"))

# save for the shiny app
go_df %>% 
  mutate(gcm = if_else(gcm == "GCMs_average", "Average across GCMs", gcm)) %>% 
  saveRDS(here("shiny/PredictionVariability/go_df.rds"))
Code
# Which method do we show on the graph? 
selected_meth <- meth_names

# Which SNP set do we show? 
selected_sets <- names(snp_sets)[c(1,6)]

# Subseting the dataset to keep only the method and SNP sets that we want for the figures
sub_df <- go_df %>% 
  filter(snp_set %in% selected_sets,
         method %in% selected_meth)
Code
# =================================
# Code the generate the bump charts
# =================================

# the colors I will use to color the populations with high GO
my_colors <- c(brewer.pal(n=12, "Paired"),"#FF40EE","cyan","yellow")

bump_charts <- sub_df %>% 
  #filter(! (snp_set == "cand_corrected")) %>% 
  # mutate(x_axis = method_snpset_names[paste0(method, "_", snp_set)]) %>%
  group_split(gcm) %>% 
  lapply(function(x){
    
high_go_pops <- x %>% filter(rank < 4) %>% pull(pop) %>% unique()    

my_palette <- c(my_colors[1:length(high_go_pops)],"#E8E8E8")

if(unique(x$gcm)=="GCMs_average"){
  plot_title <- "Average across the five GCMs"
} else {
  plot_title <- unique(x$gcm)
}

sub <- x %>% 
  mutate(flag = ifelse(pop %in% high_go_pops, TRUE, FALSE),
         pop_col = if_else(flag == TRUE, pop, "Others")) %>% 
  mutate(pop = factor(pop, levels=c(setdiff(unique(x$pop),high_go_pops),high_go_pops)),
         pop_col = factor(pop_col, levels = c(high_go_pops,"Others")))

p <- sub %>% ggplot(aes(x = method_snpset_names, y = rank, group = pop)) +
  geom_point(aes(color = pop_col), size = 2, alpha = 0.9) +
  geom_line(aes(color = pop_col), linewidth = 2, alpha = 0.8) +
  scale_y_reverse(breaks = 1:nrow(sub)) +
  scale_color_manual(values = my_palette) +
  geom_text(data = sub %>% filter(method_snpset_names == last(levels(factor(sub$method_snpset_names))) & 
                                    pop %in% high_go_pops),
            aes(label = pop, x = length(unique(sub$method_snpset_names)) + 0.1) , 
            hjust = 0.15, color = "gray20", size = 3) +#color = "#888888",
  theme_bw() +
  ylab("Population rank") +
  ggtitle(plot_title) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_blank(),
        legend.position = "none",
        axis.text.y = element_text(size=10),
        axis.text.x = element_text(size=11, angle = 60, hjust = 1),
        axis.title.y = element_text(size=16),
        axis.title.x = element_blank())

ggsave(p, filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),".pdf")), device = "pdf", width=11, height = 7)

if(unique(x$gcm)=="GCMs_average"){
  p_manuscript <- p + ggtitle("") 
  ggsave(p_manuscript, 
         filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$gcm),"_notitle.pdf")), device = "pdf", width=13)} 

p

  })

pdf(here("figs/PredictionVariability/BumpCharts_PerMethSnpSet.pdf"), width=11, height = 7)
lapply(bump_charts, function(x) x)
dev.off()

bump_charts

Across GCMs

We then compared the population ranks from the different GCMs (or from the average GO predictions across the five GCMs). For that, we generated one plot per combination method/SNP set. We also generated a plot in which the population ranks are not based on an unique combination method/SNP set but are based on the average GO predictions across all combinations method/SNP set.

In the plots below, we colored the populations that were among the three populations with the highest genomic offset in at least one GCM.

Code
bump_charts <- sub_df %>% 
  group_by(pop, gcm) %>% 
  summarise(go = mean(go)) %>% 
  ungroup() %>% 
  group_split(gcm) %>% 
  lapply(function(x){
    x %>% 
      arrange(go) %>% 
      mutate(rank = rev(1:length(pop_names)))
  }) %>% 
  bind_rows() %>% 
  mutate(meth_snpset = "method_snpset_average",
         method_snpset_names = "Average across methods and SNP sets") %>% 
  bind_rows(sub_df) %>% 
  group_split(meth_snpset) %>% 
  lapply(function(x){

high_go_pops <- x %>% 
  filter(rank < 4) %>% 
  pull(pop) %>% 
  unique()    

my_palette <- c(my_colors[1:length(high_go_pops)],"#E8E8E8")

df <- x %>% 
  mutate(flag = ifelse(pop %in% high_go_pops, TRUE, FALSE),
         pop_col = if_else(flag == TRUE, pop, "Others")) %>% 
  mutate(pop = factor(pop, levels=c(setdiff(unique(sub_df$pop),high_go_pops),high_go_pops)),
         pop_col = factor(pop_col, levels = c(high_go_pops,"Others")))


p <- df %>% 
  ggplot(aes(x = gcm, y = rank, group = pop)) +
  geom_point(aes(color = pop_col), size = 2, alpha = 0.9) +
  geom_line(aes(color = pop_col), linewidth = 2, alpha = 0.8) +
  scale_y_reverse(breaks = 1:nrow(sub_df)) +
  scale_color_manual(values = my_palette) +
  geom_text(data = df %>% filter(gcm == "UKESM1-0-LL" & pop %in% high_go_pops),
            aes(label = pop, x = 6.1) , hjust = 0.15, color = "gray20", size = 5) +#color = "#888888",
  theme_bw() +
  ylab("Population rank") +
  ggtitle(df$method_snpset_names) +
  theme(panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.major.x = element_blank(),
        axis.text.y = element_text(size=12),
        axis.text.x = element_text(size=12),
        axis.title.y = element_text(size=16),
        legend.position = "none",
        axis.title.x = element_blank())

ggsave(p, filename = here(paste0("figs/PredictionVariability/BumpChart_PopRank_",unique(x$meth_snpset),".pdf")), device = "pdf", width=10)

p
  })
`summarise()` has grouped output by 'pop'. You can override using the `.groups` argument.
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Saving 10 x 7 in image
Code
pdf(here("figs/PredictionVariability/BumpCharts_PerGCM.pdf"), width=10)
lapply(bump_charts, function(x) x)
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]

[[10]]

[[11]]
Code
dev.off()
png 
  2 
Code
bump_charts
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]


[[11]]

Correlations

Code
sub_df %>% 
  group_split(gcm) %>% 
  lapply(function(x){
    
    df_cor <- x %>% 
      pivot_wider(id_cols = pop, values_from = go, names_from = method_snpset_names)  %>% 
      dplyr::select(-pop) %>% 
      cor()
    
    
    pdf(here(paste0("figs/PredictionVariability/CorrelationPlot_",paste0(unique(x$gcm)),".pdf")))
    df_cor  %>% corrplot(method = 'number',type = 'lower', diag = FALSE,
             title="",
             mar=c(0,0,0,0),
             number.cex=1,
             tl.cex=1)
    dev.off()
      
      
    df_cor %>% corrplot(method = 'number',type = 'lower', diag = FALSE,
               title=paste0(unique(x$gcm)),mar=c(0,0,2,0),
               number.cex=1,
               tl.cex=1)
  })

Maps

Code
list_highpops <- list(LFMM = readRDS(here("outputs/LFMM/high_go_pops.rds"))[[2]],
                      RDA = readRDS(here("outputs/RDA/high_go_pops_predict.rds"))[[2]],
                      GDM = readRDS(here("outputs/GDM/high_go_pops.rds"))[[2]],
                      GF = readRDS(here("outputs/GF/high_go_pops_common_cand.rds"))[[2]])

list_highpops[[1]] <- list_highpops[[1]] + theme(legend.position = "none")
list_highpops[[2]] <- list_highpops[[2]] + theme(legend.position = "none")
list_highpops[[3]] <- list_highpops[[3]] + theme(legend.position = "none")

plot_grid(plotlist = list_highpops, nrow=2)
Code
# Function to make the genomic offset maps
source(here("scripts/functions/make_go_map.R"))

# Population coordinates
pop_coord <-  readRDS(here(paste0("data/ClimaticData/MaritimePinePops/ClimatePopulationLocationPointEstimates_ReferencePeriods_noADJ.rds")))[[1]]$ref_means %>% dplyr::select(pop,longitude,latitude)

go_df_mean <- go_df %>% 
  filter(gcm == "GCMs_average" & snp_set == "all_cand") %>% 
  group_by(pop) %>%
  summarize(mean_rank = mean(rank, na.rm = TRUE), .groups = "drop") %>% 
  left_join(pop_coord, by="pop")

p <- ggplot() + 
    geom_sf(data = world, fill="gray98") + 
    theme_bw() +
    scale_x_continuous(limits = c(-10, 12)) +
    scale_y_continuous(limits = c(33, 50)) + 
    geom_point(data=go_df_mean, aes(x=longitude, y=latitude, color=mean_rank), size= 3) + 
    xlab("Longitude") + ylab("Latitude") +
    theme_bw(base_size = 11) +
    theme(panel.grid = element_blank(), 
          plot.background = element_blank(), 
          panel.background = element_blank(), 
          legend.position = c(0.8,0.15),
          legend.box.background = element_rect(colour = "gray80"),
          legend.title = element_text(size=12),
          legend.text = element_text(size=10),
          strip.text = element_text(size=11)) +
    scale_color_gradientn(name = "Mean rank", colours = rainbow(5))
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2 3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.
Code
p %>% ggsave(here(paste0("figs/GOmeanPredictions_AllCandidateSNPs_AcrossGOMethods.pdf")),., width=6,height=6, device="pdf")
p %>% ggsave(here(paste0("figs/GF/GOmeanProjections_AllCandidateSNPs_AcrossGOMethods.png")),., width=6,height=6)

p

Climatic distance

We build a dataset with both the genomic offset predictions and the climatic distances.

Code
# Set of select climatic variables
clim_var <- readRDS(here("data/ClimaticData/NamesSelectedVariables.rds"))

# Loading the climatic datasets (which are mean-centered and standardized)
source(here("scripts/functions/generate_scaled_clim_datasets.R"))
clim_dfs <- generate_scaled_clim_datasets(clim_var)

# We calculate the climatic distances for each GCM
climdist <- lapply(clim_dfs$clim_pred, function(x){
  
  # Calculate absolute differences 
  df <- abs(x[,clim_var] - clim_dfs$clim_ref[,clim_var]) %>% 
    as_tibble() %>% 
    bind_cols(x[c("pop","gcm")],.) %>% 
    mutate(eucli = unique(sqrt(rowSums((clim_dfs$clim_ref[,clim_var] - x[,clim_var])^2))))
}) %>% bind_rows()

# We calcualte the climatic distances for the average across GCMs
climdist <- climdist %>%
  group_by(pop) %>%
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) %>% 
  mutate(gcm="GCMs_average") %>% 
  bind_rows(climdist) %>% 
  pivot_longer(cols = all_of(c(clim_var,"eucli")), names_to = "var_code", values_to = "val") %>% 
  mutate(var_name = case_when(var_code == "bio1" ~ "Mean annual temperature (bio1, °C)",
                              var_code == "bio12" ~ "Annual precipitation (bio12, mm)",
                              var_code == "bio15" ~ "Precipitation seasonality (bio15, index)",
                              var_code == "bio3" ~ "Isothermality (bio3, index)",
                              var_code == "bio4" ~ "Temperature seasonality (bio4, °C)",
                              var_code == "SHM" ~ "Summer heat moisture index (SHM, °C/mm)",
                              var_code == "eucli" ~ "Euclidean climatic distance"),
         method_name = "Climatic distance")

# We merge the climatic distances with the genomic offset predictions
go_climdist_df <- go_df %>% 
  dplyr::rename(method_name = method,
                var_code = snp_set,
                var_name = snpset_names,
                val = go) %>% 
  dplyr::select(-method_snpset_names, -meth_snpset, -rank) %>% 
  bind_rows(climdist) %>% 
  mutate(gcm = if_else(gcm == "GCMs_average", "Average across GCMs", gcm)) %>% 
  left_join(readRDS(here("data/GenomicData/MainGenePoolInformation.rds"))[[1]], by="pop")

# We export the dataset for the shinny app
go_climdist_df %>% saveRDS(here("shiny/PredictionVariability/go_climdist_df.rds"))